/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
/* -*-C++-*-
 *****************************************************************************
 *
 * File:         conversionSJIS.cpp
 * RCS:          $Id: 
 * Description:  The implementation of SJIS related conversion routines
 *               
 *               
 * Created:      7/8/98
 * Modified:     $ $Date: 1998/08/10 16:00:50 $ (GMT)
 * Language:     C++
 * Status:       $State: Exp $
 *
 *
 *
 *
 *****************************************************************************
 */
// define MODULE_DEBUG when the module is to be debugged separately.
//#define MODULE_DEBUG

#include "NLSConversion.h"
#include "nawstring.h"


#ifdef MODULE_DEBUG
#include "stringBuf.cpp"
#endif


///////////////////////////////////////////////////////////////////////
// Standard SJIS code point ranges based on the book "Understanding 
// Japanese Information Processing" Table 4-9. pp. 73 and the massaged
// SJIS Unicode mapping table.  The original table is available from 
// unicode.org.
///////////////////////////////////////////////////////////////////////

// Returns 1 when x lies in the closed interval [lower, upper], 0 otherwise.
inline Int32 in_range(Int32 x, Int32 lower, Int32 upper)
{
   if ( x < lower )
      return 0;

   if ( upper < x )
      return 0;

   return 1;
}

// Non-zero for a byte in the first single-byte SJIS block, [0x00, 0x7F].
#define isSingleByteSJIS1stBlock(x) in_range((x), 0x00, 0x7F)

// Non-zero for a byte in the second single-byte SJIS block, [0xA1, 0xDF].
#define isSingleByteSJIS2ndBlock(x) in_range((x), 0xA1, 0xDF)

// An inclusive range [lower, upper] of byte values.
struct SJISCodeBound {
        unsigned char lower;   // smallest byte value in the range
        unsigned char upper;   // largest byte value in the range
};
typedef struct SJISCodeBound SJISCodeBoundT;
      
// Releases the output buffer of a sjisToUnicode() call that is about to fail.
//
// This mirrors the failure handling in unicodeToSjis(): the buffer is
// released only when the caller supplied no result buffer (result is NULL),
// through the given heap if there is one and with delete otherwise.
// Always returns NULL, so a failing path can simply return this call.
static NAWcharBuf*
discardSjisToUnicodeOutput(NAWcharBuf* output, NAWcharBuf* result, CollHeap* heap)
{
   if ( result == NULL ) {
      if ( heap ) {
         NADELETE(output, NAWcharBuf, heap);
      }
      else {
         delete output;
      }
   }
   return NULL;
}

// This function takes a SJIS string and returns its Unicode equivalent.
//
// Arguments:
//   sjisString    the SJIS bytes to convert (getStrLen() bytes are read).
//   heap          heap handed to checkSpace() for the output buffer; may be
//                 NULL.
//   result        optional caller-supplied output buffer. In case it is NULL
//                 or the buffer it points at is not big enough, memory is
//                 allocated from the heap (if the heap pointer is not NULL),
//                 or from the C run-time system heap. The buffer selection is
//                 delegated to checkSpace().
//   addNullAtEnd  when TRUE, a 0 character is stored after the last
//                 converted character.
//
// Returns the buffer holding the converted string, with its string length
// set to the number of Unicode characters produced.
//
// If memory allocation fails, the function returns NULL.
// If any illegal characters are encountered, the function also returns NULL;
// in that case an output buffer obtained while result was NULL is released
// before returning.
//
NAWcharBuf*
sjisToUnicode(const charBuf& sjisString, CollHeap *heap,
              NAWcharBuf*& result, NABoolean addNullAtEnd)
{

//
// These arrays are generated by the script /MiscVOB/i18n/sjis.ksh 
// and are copied into /sqlvob4/common. The seed file is 
// /MiscVOB/i18n/mx_sjis.txt which defines the Sjis to Unicode mapping. 
// Whenever a change is made to mx_sjis.txt, these arrays have to be
// regenerated by running sh sjis.ksh.
//
// Each of them (array_x for x in 0 to 4) represents the
// Unicode code values corresponding to blocks of SJIS code in a particular 
// range. The content of each array is included from a .h file whose name
// contains two hex numbers that define the SJIS char range the array
// represents. Note the range is relevant to the leading byte of the SJIS
// characters only. For example, sjis_81_84.h holds all Unicode chars mappable
// from SJIS in the range from 0x81 to 0x84. 
//
// The trailing bytes of SJIS characters in each range run continuously 
// in strict ascending order, from 0x40 to 0xFC, except the "gap" characters
// at 0x7F. Such gap characters are excluded from these arrays because they are 
// not in SJIS.
//
// A few filler non-Unicode characters (0xFFFF) are purposely inserted in 
// these arrays to represent any undefined characters in SJIS. These fillers 
// are useful in making fast algorithmic conversion possible through code point
// value manipulation. No filler character will be returned however.
//
// In each included file generated by the tool sjis.ksh, each line is in the
// format 
//     { /* SJIS code value */  Unicode code value }, /* remark */
// 
// For example
//     { /* 0x8140 */ 0x3000 }, /* IDEOGRAPHIC SPACE */
//
//
   static const NAWchar array_0[] = {
   #include "sjis_81_84.h"
   	};
   
   static const NAWchar array_1[] = {
   #include "sjis_87_9f.h"
   	};
   
   static const NAWchar array_2[] = {
   #include "sjis_e0_ea.h"
   	};
   
   static const NAWchar array_3[] = {
   #include "sjis_ed_ee.h"
   	};
   
   static const NAWchar array_4[] = {
   #include "sjis_fa_fc.h"
   	};
   
   // ranges determined by the leading byte; entry k corresponds to array_k
   static const SJISCodeBoundT SJISLeadByteBounds[] =
   {
      {0x81, 0x84}, { 0x87, 0x9F}, {0xE0, 0xEA}, {0xED, 0xEE}, {0xFA, 0xFC},
      {0x0, 0xFF } // catch all
   };

   // blocks determined by the trailing byte
   static const SJISCodeBoundT SJISTrailByteBounds[] =
   {
      {0x40, 0x7E}, {0x80, 0xFC}, 
      {0x0, 0xFF } // catch all
   };
 

   unsigned char* source = sjisString.data();
   Int32 sourceLen = sjisString.getStrLen();

   // the output Unicode string will have at most sjisString.length() 
   // characters. An extra char may be added depending on addNullAtEnd.
   NAWcharBuf* output = checkSpace(heap, sourceLen, result, addNullAtEnd);

   if ( output == 0 )
      return 0;

   NAWchar *base, *target;
   base = target = output -> data();

   unsigned char c, d;
   NAWchar u;
   Int32 i=0;

   while ( i < sourceLen ) {

      c = source[i++];

      if ( isSingleByteSJIS1stBlock(c) ) 
         u = (NAWchar)c;                 // found in the first single-byte block
      else 
      if ( isSingleByteSJIS2ndBlock(c) )
         u = (NAWchar)c - 0xA1 + 0xFF61; // found in the 2nd single-byte block
      else {

         // the second byte does not exist!
         if ( i == sourceLen )
            return discardSjisToUnicodeOutput(output, result, heap);

         // get the trailing byte
         d = source[i++];

         // is d a bad trailing byte?
         if ( d == 0x7F || d == 0xFD || d == 0xFE || d == 0xFF )
            return discardSjisToUnicodeOutput(output, result, heap);

         Int32 lead, trail;

         // find the range in which c is in. lead ends up as 5 when c is
         // in none of the five real ranges (the catch-all entry is not
         // consulted).
         for ( lead = 0; lead<=4; lead++ ) 
            if ( in_range(c, SJISLeadByteBounds[lead].lower, 
                             SJISLeadByteBounds[lead].upper))
               break;

         // find the first or the second block for d. The catch-all entry
         // always matches, so trail ends up as 2 for any d outside both
         // real blocks.
         for (trail=0; trail<=2; trail++ ) 
            if ( in_range(d, SJISTrailByteBounds[trail].lower,
                             SJISTrailByteBounds[trail].upper ))
               break;

         // out of range of SJIS legal code values.
         // NOTE(review): the 0xEA/0xA4 and 0xFC/0x4C limits presumably mark
         // where array_2 and array_4 end -- confirm against the generated
         // sjis_e0_ea.h and sjis_fa_fc.h.
         if ( (c == 0xEA && d >= 0xA4) ||
              (c == 0xFC && d >= 0x4C) ||
              (lead == 5 ) || (trail == 2) 
            )
         {
            return discardSjisToUnicodeOutput(output, result, heap);
         }

         // Each chunk (all chars with identical lead byte) spans the
         // trailing bytes 0x40 to 0xFF, i.e. 16 * 12 = 192 code positions.
         // But because of the missing ones at 0x7F, 0xFD, 0xFE and 0xFF
         // trailing byte, each chunk contains only 192 - 4 = 188 chars.
         //
         // In addition, a char with a trailing byte greater than 0x7F
         // (say, d) should be mapped to the entry with the index (d-1)
         // instead of (d) in the right chunk. 
         // We do this by using the expression "- trail".
         switch (lead) {
            case 0:
               u = array_0[(c-0x81) * 188 + d - 0x40 - trail];
               break;
            case 1:
               u = array_1[(c-0x87) * 188 + d - 0x40 - trail];
               break;
            case 2:
               u = array_2[(c-0xe0) * 188 + d - 0x40 - trail]; 
               break;
            case 3:
               u = array_3[(c-0xed) * 188 + d - 0x40 - trail];
               break;
            default:
               u = array_4[(c-0xfa) * 188 + d - 0x40 - trail];
         }
      }

      if ( u == 0xFFFF ) // filler chars are not defined in SJIS standard
         return discardSjisToUnicodeOutput(output, result, heap);

      *target = u;
      target++;
   }

   Int32 finalLength = target-base;

   if ( addNullAtEnd == TRUE )
      (output -> data())[finalLength] = 0;

   output -> setStrLen(finalLength);
   return output;
}

// One row of the Unicode to SJIS lookup table.
struct Unicode2SjisMap
{
   NAWchar Unicode;   // Unicode code value; the key the table is searched by
   NAWchar sjis;      // SJIS code value stored for that Unicode value
};
typedef struct Unicode2SjisMap Unicode2SjisMapT;

//
// Looks up the SJIS code for the Unicode character u by binary search over
// the Unicode to SJIS table. On a hit the stored SJIS code is placed in the
// argument sjis and TRUE is returned. On a miss FALSE is returned and sjis
// is left untouched.
//
// The stored value is handed back as is; this function applies no check on
// it. The function is static because it is a helper private to this file.
//
static
NABoolean binarySearchU2STable(NAWchar u, NAWchar& sjis)
{
   //
   // The Unicode to SJIS mapping. Each line in the include file is in
   // the format
   //
   // { Unicode_code_value, SJIS_code_value }, /* remark */
   //
   // It is assumed the Unicode_code_value column is in strict ascending
   // order; the search below depends on it.
   //
   static const Unicode2SjisMapT array_u2s[] = {
      #include "sjis_from_ucs2.h"
   };

   Int32 lo = 0;
   Int32 hi = sizeof(array_u2s)/sizeof(Unicode2SjisMapT) - 1;

   while ( lo <= hi ) {

      const Int32 mid = lo + (hi - lo) / 2;
      const NAWchar key = array_u2s[mid].Unicode;

      if ( key < u ) {
         lo = mid + 1;      // u can only be in the upper half
      }
      else if ( u < key ) {
         hi = mid - 1;      // u can only be in the lower half
      }
      else {
         sjis = array_u2s[mid].sjis;
         return TRUE;
      }
   }

   return FALSE;
}

//
// Converts a single Unicode character (in argument wc) into SJIS multibyte
// format. The bytes are written to the sjis argument, which must have room
// for two bytes. The function returns the number of SJIS bytes written.
//
// When the Unicode character is not mappable to SJIS the outcome depends on
// allowInvalidCodePoint: if it is FALSE the function returns 0; otherwise
// a single '?' is written and 1 is returned.
//
Int32 unicodeToSjisChar(NAWchar wc, unsigned char *sjis, NABoolean allowInvalidCodePoint)
{
   // Unicode [0, 0x007F] maps to the identical single SJIS byte.
   if ( in_range(wc, 0, 0x007F) ) {
      *sjis = (unsigned char)wc;
      return 1;
   }

   // Unicode [0xFF61, 0xFF9F] maps to the single SJIS bytes [0xA1, 0xDF].
   if ( in_range(wc, 0xFF61, 0xFF9F) ) {
      *sjis = (unsigned char)(wc - 0xFF61 + 0xA1);
      return 1;
   }

   // Everything else has to come from the table, and only table values
   // of 0x8140 and above are accepted as double-byte codes.
   NAWchar code;
   if ( binarySearchU2STable(wc, code) && code >= 0x8140 ) {
      const unsigned char leadByte  = (unsigned char)((code>>8) & (0xFF));
      const unsigned char trailByte = (unsigned char)(code & (0xFF));
      sjis[0] = leadByte;
      sjis[1] = trailByte;
      return 2;
   }

   // No mapping: substitute only if the caller asked for it.
   if ( allowInvalidCodePoint != FALSE ) {
      sjis[0] = '?';
      return 1;
   }

   return 0;
}

// Unicode to SJIS conversion. 
//
// This function takes a Unicode string and returns its SJIS equivalent.
// The optional sjisString argument holds the buffer into which the SJIS
// string will be stored. In case the argument is NULL or it is not big
// enough, the function allocates memory from the heap (if the heap pointer
// is not NULL), or from the C run-time system heap. The buffer selection is
// delegated to checkSpace().
//
// addNullAtEnd requests a 0 byte after the last converted byte.
// allowInvalidCodePoint is passed on to unicodeToSjisChar() for every
// character.
//
// If the memory allocation fails, the function returns 0. If a character
// cannot be converted (unicodeToSjisChar() returns 0), the function also
// returns 0; the output buffer is released first when sjisString is NULL.
//
charBuf* unicodeToSjis(const NAWcharBuf& unicodeString, CollHeap *heap,
                       charBuf*& sjisString, NABoolean addNullAtEnd,
                       NABoolean allowInvalidCodePoint)
{
   NAWchar* input = unicodeString.data();
   const Int32 charCount = unicodeString.getStrLen();

   // Each Unicode character yields at most BYTES_PER_NAWCHAR SJIS bytes.
   // An extra byte may be added depending on addNullAtEnd.
   charBuf* output = checkSpace(heap, BYTES_PER_NAWCHAR*charCount, sjisString, addNullAtEnd);

   if ( output == 0 )
      return 0;

   unsigned char* const start = (unsigned char*) (output -> data());
   unsigned char* cursor = start;

   Int32 index = 0;
   while ( index < charCount ) {

      const Int32 written =
         unicodeToSjisChar(input[index], cursor, allowInvalidCodePoint);

      if ( written == 0 ) {
         // Conversion failed. Release the buffer only when the caller
         // did not pass one in.
         if ( sjisString == NULL ) {
            if (heap) {
               NADELETE(output, charBuf, heap);
            }
            else {
               delete output;
            }
         }
         return NULL;
      }

      cursor += written;
      index++;
   }

   Int32 finalLength = cursor - start;

   if ( addNullAtEnd == TRUE )
      (output -> data())[finalLength] = 0;

   output -> setStrLen(finalLength);
   return output;
}

#ifdef MODULE_DEBUG

//
// testing: dumping all Unicode codes for Sjis chars.
// Use the tool /MiscVOB/i18n/verify.pl to see the
// difference between the dumping and mx_sjis.txt.l2u.
// Should only see complaining about 0xFFFF.
//

// Lead bytes the debug routines below pair with every trailing byte,
// grouped by range.
static
Int32 leadbyte[] = {
        // 0x81 - 0x84
        0x81, 0x82, 0x83, 0x84,

        // 0x87 - 0x9f
        0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,

        // 0xe0 - 0xea
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
        0xe8, 0xe9, 0xea,

        // 0xed - 0xee
        0xed, 0xee,

        // 0xfa - 0xfc
        0xfa, 0xfb, 0xfc
   };

void sjisDump()
{
   charBuf sjis(2);
   NAWcharBuf* unicode = new NAWcharBuf(10);

   for ( Int32 i=0; i<0xFF; i++ ) {
      sjis.data()[0] = i;
      sjis.setLength(1);

      unicode = sjisToUnicode(sjis, 0, unicode);
      if ( unicode )
         printf("0x%X  0x%X\n", i, unicode->data()[0]);
   }
	
   for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ )
      for (Int32 j=0x40; j<=0xFF; j++ ) {
         sjis.data()[0] = leadbyte[i];
         sjis.data()[1] = j;
         sjis.setLength(2);

         unicode = sjisToUnicode(sjis, 0, unicode);
         if ( unicode )
            printf("0x%X%X  0x%X\n", leadbyte[i], j, unicode->data()[0]);
      }

   delete unicode;
}

// Roundtrip verification. 
//
// Perform a round-trip conversion testing for each valid SJIS.
//
// should not see any warnings.
//
void sjis2Unicode2sjis()
{
   printf("SJIS round-trip testing: ");

   charBuf sjis(2);
   NAWcharBuf* unicode = new NAWcharBuf(10);
   charBuf* remapped_sjis = new charBuf(10);

   // do the test for single-byte chars
   for ( Int32 i=0; i<0xDF; i++ ) {

      // skip non-SJIS chars
      if ( in_range(i, 0x80, 0xA0) )
         continue;

      sjis.data()[0] = i;
      sjis.setLength(1);

      unicode = sjisToUnicode(sjis, 0, unicode);

      if ( unicode == NULL ) {
         printf("SJIS to Unicode mapping failed: 0x%X.\n", i);
         return;
      }

      remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis);

      if ( remapped_sjis == NULL ) {
        printf("Unicode 0x%X can not be mapped to SJIS\n", 
               unicode->data()[0]);
        return;
      }

      if ( remapped_sjis->data()[0] != sjis.data()[0] ) {
        printf("Roundtrip mapping failed: 0x%X  0x%X\n", 
               sjis.data()[0], remapped_sjis->data()[0]
              );
        return;
      }
   }
	
   // do the test for double-byte chars
   for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ )
      for (Int32 j=0x40; j<=0xFF; j++ ) {

         sjis.data()[0] = leadbyte[i];
         sjis.data()[1] = j;
         sjis.setLength(2);

         unicode = sjisToUnicode(sjis, 0, unicode);
         if ( unicode == NULL ) {
            continue; // skip non-exist chars 
         }

         remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis);

         if ( remapped_sjis == NULL ) {
            printf("Unicode 0x%X can not be mapped to SJIS\n", 
                   unicode->data()[0]);
            return;
         }

         if ( remapped_sjis->data()[0] != sjis.data()[0] || 
              remapped_sjis->data()[1] != sjis.data()[1] 
            ) {
           printf("Roundtrip mapping failed: sjis=0x%X%X  unicode=0x%X, remapped_sjis=0x%X%X\n", 
                  sjis.data()[0], sjis.data()[1],
                  unicode->data()[0],
                  remapped_sjis->data()[0], remapped_sjis->data()[1]
                 );
           return;
         }
      }

   delete unicode;
   delete remapped_sjis;

   printf("OK\n");
}

//
// Testing: assure all SJIS-mappable Unicode chars can be mapped
// to SJIS. Should not see any "Bad code ..." output.
//
void UCS2ToSjis()
{
   printf("Unicode to SJIS testing: ");

   NAWchar sjisChar;
   NAWcharBuf unicode(1);
   charBuf* sjis = new charBuf(10);

   Int32 n = sizeof(array_u2s)/sizeof(Unicode2SjisMapT);
   for ( Int32 i=0; i<n; i++ )
   {
      unicode.data()[0] = array_u2s[i].Unicode;
      unicodeToSjis(unicode, 0, sjis);

      switch (sjis->length()) {
         case 2:
            sjisChar = (NAWchar)(sjis->data()[1]);
            sjisChar |= (NAWchar)((sjis->data()[0] <<8));
            break;
         default:
            printf("Bad code 0x%X\n", array_u2s[i].Unicode);
            return;
      }
      if ( sjisChar != array_u2s[i].sjis ) {
         printf("Bad code 0x%x for 0x%X\n", sjisChar, array_u2s[i].Unicode);
         return;
      }
   }

   for ( i=0; i<=0x7F; i++ )
   {
     unicode.data()[0] = (NAWchar)i;
     unicodeToSjis(unicode, 0, sjis);

     switch (sjis->length()) {
        case 1:
           sjisChar = (NAWchar)sjis->data()[0];
           break;
        default:
           printf("Bad code 0x%X\n", i);
           return;
     }
     if ( sjisChar != i ) {
        printf("Bad code 0x%x for 0x%X\n", sjisChar, i);
        return;
     }
   }

   for ( i=0xFF61; i<=0xFF9F; i++ )
   {
     unicode.data()[0] = (NAWchar)i;
     unicodeToSjis(unicode, 0, sjis);

     switch (sjis->length()) {
        case 1:
           sjisChar = (NAWchar)sjis->data()[0];
           break;
        default:
           printf("Bad code 0x%X\n", i);
           return;
     }
     if ( sjisChar != i - 0xFF61 + 0xA1 ) {
        printf("Bad code 0x%x for 0x%X\n", sjisChar, i);
        return;
     }
   }
   printf("OK\n");
}

//
// Define MODULE_DEBUG at the beginning of this file and select one 
// of the following to test.  Build the test app with the command:
//
// cl /nologo /Zp4 /W3 /GX /Od /MDd /D "_DEBUG" /D "NA_WINNT" /Z7 \
// -o a.exe conversionsjis.cpp
//
// Entry point of the stand-alone test build: runs the round-trip test and
// then the Unicode to SJIS test. The command line is not used.
Int32 main(Int32 argc, char** argv)
{
   (void) argc;
   (void) argv;

   //sjisDump();
   sjis2Unicode2sjis();
   UCS2ToSjis();

   return 0;
}

#endif
